Daten importieren und vorbereiten¶

In [ ]:
# Imports
import pandas as pd

# Read the enriched dataset; parse 'Zeitstempel' as datetime so the
# min()/max() date-range check below works on real timestamps
data_enriched = pd.read_csv("data_enriched.csv", parse_dates=['Zeitstempel'])
In [ ]:
# Date range covered by the data (printed below: 2018-01-02 .. 2022-02-22)
print(data_enriched['Zeitstempel'].min())
print(data_enriched['Zeitstempel'].max())

# Drop identifier/time columns AND the log-transformed target 'Menge_log'
# (the original comment mentioned only ArtikelNr and Zeitstempel)
data_enriched = data_enriched.drop(['ArtikelNr', 'Zeitstempel', 'Menge_log'], axis=1)
2018-01-02 00:00:00
2022-02-22 00:00:00
In [ ]:
# Alle Artikel
all_articles = data_enriched['Artikel'].value_counts().index.tolist()
all_articles
Out[ ]:
['Laugen-Gipfel          of',
 'Butter-Gipfel / Croissant of',
 'caffè crema',
 'St. Galler Handbürli Culinarium *Gold prämiert*',
 'Körnergipfel of',
 'Weggli',
 'Mais-Gipfel of',
 'Sonnenblumenbrötli  of',
 'St. Galler Handb dk. Culinarium *Gold prämiert*',
 'Semmel                 of',
 'Nuss-Stengel Original',
 'Pain Roule rustico klein  of',
 'Gallusbrot 400   of',
 'Mais-Brötli süss   of',
 'Berliner m Confi Himbeer of',
 'Erdbeertörtli gross    ',
 'Butterzopf       440   2-teilig ',
 'Ziger-Krapfen          of']

Alle Modelle fine-tunen¶

In [ ]:
# Funktionen importieren
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
In [ ]:
# Function to calculate the score
def calculate_score(result):
    # Calculate score
    score = (0.6 * result['R2'] * (1 - result['MAPE']) * (1 / result['RMSE'])) + (0.2 * (1 - result['MAPE']) * (1 / result['RMSE'])) + (0.2 * result['R2'])
    return score
In [ ]:
# Create dictionary for best_models
best_models = {}
# Dateframe for all metrics
all_results = pd.DataFrame()

# Define hyperparameters for random search
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 50],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 50],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.5, 1.0],
    'colsample_bytree': [0.4, 1.0],
    'min_child_weight': [1, 4]
}

# Loop over articles
for artikel in all_articles:

    # Filter by Artikel
    data_filtered = data_enriched[data_enriched['Artikel'] == artikel].drop(['Artikel'], axis=1)

    X = data_filtered.drop(['Menge'], axis=1)
    y = data_filtered['Menge']

    # Split data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4153)

    models = [
        ('LinearRegression', LinearRegression(), {}),
        ('RandomForestRegressor', RandomForestRegressor(), rf_params),
        ('XGBRegressor', XGBRegressor(), xgb_params)
    ]

    # Initialize a variable to keep track of the best RMSE and corresponding model
    best_rmse = float('inf')
    best_model_info = None

    for name, model, params in models:

        if params:
            # Hyperparameter tuning using RandomizedSearchCV
            model_cv = RandomizedSearchCV(model, params, cv=5, n_iter=10, random_state=4153)
            model_cv.fit(X_train, y_train)
            best_model = model_cv.best_estimator_
        else:
            best_model = model
            best_model.fit(X_train, y_train)

        # Predict y
        y_pred = best_model.predict(X_test)

        # Calculate metrics
        r2 = r2_score(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Save results in a dictionary
        result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': name}

        # Calculate score
        result['score'] = calculate_score(result)

        # Add to results dataframe
        all_results = pd.concat([all_results, pd.DataFrame([result])], ignore_index=True)    

        # If it's the best model so far, save it
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_info = {'model': best_model, 'metrics': result}
    
    # Save the best model for this artikel
    best_models[artikel] = best_model_info
In [ ]:
import pickle

# Persist the per-article best models for later reuse.
# NOTE: pickle files execute arbitrary code on load — only unpickle
# files from trusted sources.
with open('best_models.pkl', 'wb') as f:  
    pickle.dump(best_models, f)

Plot Results¶

Our Score¶

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the custom score of every model per article
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=all_results, x='Artikel', y='score', hue='modelname', ax=ax)
ax.set_title('Scores for each Artikel')
ax.set_ylabel('Score')
ax.set_xlabel('Artikel')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Model')
plt.show()

R2¶

In [ ]:
# R2 of every model per article
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=all_results, x='Artikel', y='R2', hue='modelname', ax=ax)
ax.set(title='R2 for each Artikel', xlabel='Artikel', ylabel='R2')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Model')
plt.show()

RMSE¶

In [ ]:
# RMSE of every model per article
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=all_results, x='Artikel', y='RMSE', hue='modelname', ax=ax)
ax.set(title='RMSE for each Artikel', xlabel='Artikel', ylabel='RMSE')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Model')
plt.show()

Variable Importance¶

In [ ]:
from functions import plot_variable_importance

# Plot feature importances / coefficients of each article's best model.
# NOTE(review): X_train here is leftover state from the LAST iteration of
# the training loop above — this only works if all articles share the same
# feature columns; confirm that assumption.
for artikel, models in best_models.items():
    model = models['model']
    model_name = models['metrics']['modelname']

    plot_variable_importance(model, X_train, model_name, artikel)

Try Voting Regressor with best models¶

In [ ]:
from sklearn.ensemble import VotingRegressor

# Best model (incl. VotingRegressor) per article
best_models_voting = {}

# Collect metric rows in a list; building a DataFrame once at the end
# avoids quadratic pd.concat growth inside the loop
voting_result_rows = []

# Loop over articles
for artikel in all_articles:

    # Restrict to this article; the article name itself is not a feature
    data_filtered = data_enriched[data_enriched['Artikel'] == artikel].drop(['Artikel'], axis=1)

    X = data_filtered.drop(['Menge'], axis=1)
    y = data_filtered['Menge']

    # Split data into train and test set (fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4153)

    models = [
        ('LinearRegression', LinearRegression(), {}),
        ('RandomForestRegressor', RandomForestRegressor(), rf_params),
        ('XGBRegressor', XGBRegressor(), xgb_params)
    ]

    # Track the best RMSE and corresponding model for this article
    best_rmse = float('inf')
    best_model_info = None

    # Tuned base estimators that will feed the VotingRegressor
    estimators = []

    for name, model, params in models:

        if params:
            # Hyperparameter tuning using RandomizedSearchCV
            model_cv = RandomizedSearchCV(model, params, cv=5, n_iter=10, random_state=4153)
            model_cv.fit(X_train, y_train)
            best_model = model_cv.best_estimator_
        else:
            # Empty search space (LinearRegression): fit the plain estimator
            best_model = model
            best_model.fit(X_train, y_train)

        # Add the tuned model to the ensemble's estimator list
        estimators.append((name, best_model))

        # Evaluate on the held-out test set
        y_pred = best_model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        # sqrt of MSE; avoids the squared=False argument, which was
        # deprecated in scikit-learn 1.4 and removed in 1.6
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Collect metrics for this (article, model) pair
        result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': name}
        result['score'] = calculate_score(result)
        voting_result_rows.append(result)

        # Keep the model with the lowest test RMSE so far
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_info = {'model': best_model, 'metrics': result}

    # Ensemble of the three tuned base models (unweighted average)
    voting_regressor = VotingRegressor(estimators)
    voting_regressor.fit(X_train, y_train)

    # Evaluate the ensemble on the same test set
    y_pred = voting_regressor.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    mape = mean_absolute_percentage_error(y_test, y_pred)

    result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': 'VotingRegressor'}
    result['score'] = calculate_score(result)
    voting_result_rows.append(result)

    # The ensemble may beat all individual models
    if rmse < best_rmse:
        best_rmse = rmse
        best_model_info = {'model': voting_regressor, 'metrics': result}

    # Save the best model and metrics for this artikel
    best_models_voting[artikel] = best_model_info

# Build the results DataFrame in one pass
all_results_voting = pd.DataFrame(voting_result_rows)
In [ ]:
import pickle

# Persist the per-article best models (incl. VotingRegressor winners).
# NOTE: pickle files execute arbitrary code on load — only unpickle
# files from trusted sources.
with open('best_models_voting.pkl', 'wb') as f:  
    pickle.dump(best_models_voting, f)

Plot results¶

Our Score¶

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the custom score of every model (incl. VotingRegressor) per article
plt.figure(figsize=(10, 5))
sns.barplot(data=all_results_voting, x='Artikel', y='score', hue='modelname')
plt.title('Scores for each Artikel')
plt.ylabel('Score')  # fixed: axis was mislabeled 'R2' although 'score' is plotted
plt.xlabel('Artikel')
plt.xticks(rotation=90)
plt.legend(title='Model')
plt.show()

R2¶

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# R2 of every model (incl. VotingRegressor) per article
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=all_results_voting, x='Artikel', y='R2', hue='modelname', ax=ax)
ax.set(title='R2 for each Artikel', xlabel='Artikel', ylabel='R2')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Model')
plt.show()

RMSE¶

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# RMSE of every model (incl. VotingRegressor) per article
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=all_results_voting, x='Artikel', y='RMSE', hue='modelname', ax=ax)
ax.set(title='RMSE for each Artikel', xlabel='Artikel', ylabel='RMSE')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Model')
plt.show()

Plot Variable Importance¶

In [ ]:
from functions import plot_variable_importance

# Plot feature importances / coefficients of each article's best model.
# NOTE(review): X_train is leftover state from the LAST loop iteration above;
# this assumes identical feature columns across articles — confirm.
# The "No feature importances ..." output below indicates plot_variable_importance
# cannot extract importances when the winner is a VotingRegressor.
for artikel, models in best_models_voting.items():
    model = models['model']
    model_name = models['metrics']['modelname']

    plot_variable_importance(model, X_train, model_name, artikel)
No feature importances or coefficients available for model Estimator 0
No feature importances or coefficients available for model Estimator 0
No feature importances or coefficients available for model Estimator 0